knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
#import libraries
#PLEASE NOTE: PLEASE INSTALL BELOW PACKAGES PRIOR TO RUNNING IF NOT PREVIOUSLY INSTALLED
library(tidyverse)
library(lubridate)
library(knitr)
library(scales)
library(forcats)
library(plotly)
library(DT)
#import data to read and store as variable
shooting_data <- read_csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv")
#update date fields to proper class
shooting_data$OCCUR_DATE <- mdy(shooting_data$OCCUR_DATE)
shooting_data$HOUR <- hour(shooting_data$OCCUR_TIME)
# No output in this chunk
# Create dataframe of shootings grouped by BORO and hour they occured
hourly_data <- shooting_data %>%
group_by(BORO, HOUR) %>%
summarise(Shootings = n()) %>%
group_by(BORO) %>%
mutate(Total_Shootings = sum(Shootings)) %>%
arrange(desc(Total_Shootings)) %>%
mutate(BORO = fct_reorder(BORO, Total_Shootings, max),
PCT_Shootings = Shootings/Total_Shootings,
Percent_Shootings = percent(PCT_Shootings, 0.01))
# Create a plot showing what hour shootings take place
# Use facet wrap to display distinct graphs for the boroughs
line_plot <- hourly_data %>%
ggplot(aes(x = HOUR, y = PCT_Shootings, label = Percent_Shootings, label2 = BORO)) +
geom_area(stat = "identity", fill = "darkturquoise") +
scale_y_continuous(labels = percent) +
theme_classic() +
facet_wrap(~ BORO) +
theme(panel.spacing = unit(2, "lines")) +
labs(x = NULL, y = NULL, title = "Percentage of Overall Shootings by Hour of Day")
ggplotly(line_plot, tooltip = c("x", "label", "label2"))
# Get both maximum and minimum shooting volume totals for borough/hour
# Use stringr library to print nicely in table
min_max_hours <- hourly_data %>%
group_by(BORO) %>%
mutate(`Max Shootings` = max(Shootings),
`Min Shootings` = min(Shootings),
TIME = ifelse(HOUR == 0, "12 AM",
ifelse(HOUR < 12, str_c(HOUR, " AM"),
ifelse(HOUR == 12, "12 PM", str_c(HOUR-12, " PM"))))) %>%
ungroup() %>%
filter(Shootings == `Max Shootings` | Shootings == `Min Shootings`) %>%
mutate(Category = ifelse(Shootings == `Max Shootings`, "Max", "Min"),
Values = str_c(TIME, " (", Shootings, ")")) %>%
select(BORO, Values, Category) %>%
pivot_wider(names_from = BORO, values_from = Values)
kable(min_max_hours, caption = "Highest and Lowest Shootings by Hour")
| Category | BROOKLYN | BRONX | QUEENS | MANHATTAN | STATEN ISLAND |
|---|---|---|---|---|---|
| Min | 9 AM (73) | 8 AM (48), 9 AM (48) | 8 AM (28) | 7 AM (20), 9 AM (20) | 10 AM (4) |
| Max | 11 PM (817) | 12 AM (599) | 2 AM (304) | 12 AM (270) | 4 AM (59) |
# Get shooting volume by borough and month
month_grouped <- shooting_data %>%
mutate(Month = month(OCCUR_DATE, label = TRUE, abbr = FALSE)) %>%
group_by(BORO, Month) %>%
summarise(Shootings = n())
#Show bar graphs for each borough and shooting volume based on hour
month_bar <- month_grouped %>%
ggplot(aes(x = fct_rev(Month), y = Shootings, label = Month, label2 = BORO)) +
geom_bar(stat = "identity", fill = "darkturquoise") +
theme_classic() +
labs(x = NULL, y = NULL,
title = "NYC Volume of Shootings from 2006-2020 by Month and Borough") +
coord_flip() +
facet_wrap(~ BORO, scales = "free", ncol = 2) +
theme(panel.spacing = unit(3, "lines"))
ggplotly(month_bar, tooltip = c("label", "y", "label2"))
# Similar to hourly table get min/max values and format to print nicely in data table
min_max_months <- month_grouped %>%
group_by(BORO) %>%
mutate(`Max Shootings` = max(Shootings),
`Min Shootings` = min(Shootings)) %>%
ungroup() %>%
filter(Shootings == `Max Shootings` | Shootings == `Min Shootings`) %>%
mutate(Category = ifelse(Shootings == `Max Shootings`, "Max", "Min"),
Values = str_c(Month, " (", Shootings, ")")) %>%
select(-`Max Shootings`, -`Min Shootings`, -Shootings, -Month) %>%
pivot_wider(names_from = BORO, values_from = Values)
kable(min_max_months, caption = "Highest and Lowest Shootings by Month")
| Category | BRONX | BROOKLYN | MANHATTAN | QUEENS | STATEN ISLAND |
|---|---|---|---|---|---|
| Min | February (312) | February (444) | February (146) | February (215) | February (32) |
| Max | August (824) | July (1260) | August (330) | August (379) | July (84) |
# Not analyzing by Borough at this time
# Instead of looking at individual hours/months try and group by time/month ranges
shooting_data <- shooting_data %>%
mutate(Month_Group = ifelse(month(OCCUR_DATE) < 4, "Jan-Mar",
ifelse(month(OCCUR_DATE) < 7, "Apr-Jun",
ifelse(month(OCCUR_DATE) < 10, "Jul-Sep", "Oct-Dec"))),
Hour_Group = ifelse(HOUR < 3 | HOUR > 22, "11PM - 2AM",
ifelse(HOUR < 8, "3 - 7AM",
ifelse(HOUR < 13, "8AM - 12PM",
ifelse(HOUR < 18, "1 - 5PM", "6 - 10PM")))))
# Create factors for Months and Hours
# Ensure levels are sorted (earliest first in graphs/tables)
shooting_data$Hour_Group <- factor(shooting_data$Hour_Group,
levels = c("3 - 7AM", "8AM - 12PM", "1 - 5PM",
"6 - 10PM", "11PM - 2AM"))
shooting_data$Month_Group <- factor(shooting_data$Month_Group,
levels = c("Jan-Mar", "Apr-Jun", "Jul-Sep", "Oct-Dec"))
month_time <- shooting_data %>%
group_by(Month_Group, Hour_Group) %>%
summarise(Shootings = n())
# Plot month/hour ranges data
month_time_plot <- month_time %>%
ggplot(aes(x = Month_Group, y = Shootings, fill = Hour_Group)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Month Range", y = "Volume of Shootings", fill = "Time of Day",
title = "Shootings Volume by Month and Time of Day") +
theme_classic() +
scale_fill_brewer(palette = "Pastel2", direction = -1)
ggplotly(month_time_plot)
# Use similar data based on hour/month ranges to create a predictive model
# Borough not included
# Model Goal: Given the total count of shootings in a given year predict the volume of shootings given a month/hour range combination
hour_model_data <- shooting_data %>%
mutate(Year = year(OCCUR_DATE)) %>%
#filter(Year >= 2019) %>%
group_by(Hour_Group, Month_Group, Year) %>%
summarise(Shootings = n()) %>%
group_by(Year) %>%
mutate(Total_Shootings = sum(Shootings))
hour_model <- lm(Shootings ~ Total_Shootings + Hour_Group + Month_Group, data = hour_model_data)
#Print output of model
summary(hour_model)
##
## Call:
## lm(formula = Shootings ~ Total_Shootings + Hour_Group + Month_Group,
## data = hour_model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.193 -17.056 -2.953 11.708 123.007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -38.273333 7.742803 -4.943 1.30e-06 ***
## Total_Shootings 0.050000 0.004032 12.402 < 2e-16 ***
## Hour_Group8AM - 12PM -42.333333 4.977887 -8.504 9.73e-16 ***
## Hour_Group1 - 5PM -3.450000 4.977887 -0.693 0.489
## Hour_Group6 - 10PM 56.383333 4.977887 11.327 < 2e-16 ***
## Hour_Group11PM - 2AM 58.366667 4.977887 11.725 < 2e-16 ***
## Month_GroupApr-Jun 29.680000 4.452358 6.666 1.32e-10 ***
## Month_GroupJul-Sep 49.800000 4.452358 11.185 < 2e-16 ***
## Month_GroupOct-Dec 18.440000 4.452358 4.142 4.52e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.27 on 291 degrees of freedom
## Multiple R-squared: 0.7527, Adjusted R-squared: 0.7459
## F-statistic: 110.7 on 8 and 291 DF, p-value: < 2.2e-16
#Add in new column to model dataframe with predictions
hour_model_data$Prediction <- predict(hour_model)
#Create column showing how far off prediction was for each data point
hour_model_data <- hour_model_data %>%
mutate(Prediction_Difference = Prediction - Shootings)
# Create graphs of models performance
# Due to large data, only printing graphs for certain years in data set
model_graph_data <- hour_model_data %>%
select(-Total_Shootings,-Prediction_Difference) %>%
pivot_longer(cols = c(-Hour_Group, -Year, -Month_Group))
model_graph_2008 <- model_graph_data %>%
filter(Year == 2008) %>%
ggplot(aes(x = Month_Group, y = value, fill = name)) +
geom_bar(stat = "identity", position = "dodge") +
theme_classic() +
labs(x = NULL, y = NULL, fill = "Metric", title = "Predicted Total Shootings 2008") +
scale_fill_brewer(palette = "Pastel1") +
facet_wrap(~ Hour_Group, scales = "free") +
theme(panel.spacing = unit(3, "lines"))
model_graph_2015 <- model_graph_data %>%
filter(Year == 2015) %>%
ggplot(aes(x = Month_Group, y = value, fill = name)) +
geom_bar(stat = "identity", position = "dodge") +
theme_classic() +
labs(x = NULL, y = NULL, fill = "Metric", title = "Predicted Total Shootings 2015") +
scale_fill_brewer(palette = "Pastel1") +
facet_wrap(~ Hour_Group, scales = "free") +
theme(panel.spacing = unit(3, "lines"))
model_graph_2020 <- model_graph_data %>%
filter(Year == 2020) %>%
ggplot(aes(x = Month_Group, y = value, fill = name)) +
geom_bar(stat = "identity", position = "dodge") +
theme_classic() +
labs(x = NULL, y = NULL, fill = "Metric", title = "Predicted Total Shootings 2020") +
scale_fill_brewer(palette = "Pastel1") +
facet_wrap(~ Hour_Group, scales = "free") +
theme(panel.spacing = unit(3, "lines"))
ggplotly(model_graph_2008)
ggplotly(model_graph_2015)
ggplotly(model_graph_2020)
#get average prediction difference for reference
pred_diff_avg <- round(mean(abs(hour_model_data$Prediction_Difference)), 2)
Average Prediction Value Error (Absolute Value) 19.52
#Shrink model data to most recent years and print table
condensed_model_table <- hour_model_data %>%
select(-Total_Shootings) %>%
mutate(Prediction = round(Prediction),
Prediction_Difference = round(Prediction_Difference)) %>%
filter(Year >= 2015) %>%
arrange(Hour_Group, Month_Group, desc(Year))
datatable(condensed_model_table, caption = "Model Performance 2015-2020",
options = list(pageLength = 30))